// xBRZ image upscaler/filter - modified for the Kega Fusion Emulator

// Original xbrz.cpp :
// ****************************************************************************
// * This file is part of the HqMAME project. It is distributed under         *
// * GNU General Public License: http://www.gnu.org/licenses/gpl.html         *
// * Copyright (C) Zenju (zenju AT gmx DOT de) - All Rights Reserved          *
// *                                                                          *
// * Additionally and as a special exception, the author gives permission     *
// * to link the code of this program with the MAME library (or with modified *
// * versions of MAME that use the same license as MAME), and distribute      *
// * linked combinations including the two. You must obey the GNU General     *
// * Public License in all respects for all of the code used other than MAME. *
// * If you modify this file, you may extend this exception to your version   *
// * of the file, but you are not obligated to do so. If you do not wish to   *
// * do so, delete this exception statement from your version.                *
// ****************************************************************************

// rpi-plugin creation and C++11 unraveling and some cleaning of the original code
// by "milo1012" (milo1012 AT freenet DOT de)
// no real code changes besides removing lambda functions and some preprocessor mess,
// removing <cstdint> (uint32_t) plus compressing whitespace (tabs instead space,
// bracket style)
// (do we really need lambda functions for this small code and break compatibility
// with the majority of the decent, well-tried, but older C++ compilers out there?
// - I don't think so - we can even get speedups (check the readme))
//
// other code changes:
// - aggressive speedup by removing unnecessary sub-routines for color
//   distance function and converting to float -> double precision is overkill
//   for an 8 bit resolution color model -> MSVC 7.1 et al. seem to favor it
//   -> same function: we probably don't want to calculate three subtractions
//   and two divisions with known constants in every call -> unfold/precalc it
// - removal of elaborate fillBlock() and sub functions -> in place seems faster
// - removal of std::max and std::min -> unnecessary: "std::max()" == "(a<b)?b:a"
//   -> "((x-1<0)?(0):(x-1))" can be changed to "x-((x>0)?(1):(0))"
//   -> definitely less code produced, theoret. faster
// - removal of ScalerCfg() and cfg vars -> we use xBRZ defaults anyway
// - removal of safety checks at scaleImage() start
// ... overall ~2-8 % faster (depending on machine)
//
// currently only for Windows
// for other systems: replace the stuff in DllMain and change or remove
// the threading in scale() and RenderPluginOutput() first

#ifdef _WINDOWS
	#define WIN32_LEAN_AND_MEAN
	//#define NOMINMAX // if using std::max or std::min
	#include <windows.h>
	//#undef NOMINMAX
#endif
#include <cmath>   // std::sqrt
#include <cstddef> // size_t
#include <limits>
//#include <algorithm> // if using std::max or std:.min
// FORCE_INLINE: function prefix that asks for inlining as strongly as the
// compiler allows. MSVC and GCC-compatible compilers get their vendor-specific
// spelling; every other compiler falls back to the plain "inline" keyword.
#ifdef _MSC_VER
#define FORCE_INLINE __forceinline
#elif defined __GNUC__
#define FORCE_INLINE __attribute__((always_inline)) inline
#else
#define FORCE_INLINE inline
#endif

//////////////
// main config: scaling factor (2,3,4) and threading
// don't use other values for _SCALER_ as above (5 not implemented)
// Upscaling factor. Selects Scaler2x/Scaler3x/Scaler4x in xbrz::scale() and the
// RPI_OUT_SCLn flag plus the plugin name in RenderPluginGetInfo(); any value
// other than 2 or 3 is treated as 4 there.
const unsigned char _SCALER_ = 4;
// number of threads = image slices scaled parallel in xBRZ - src image size
// must be a multiple of it! - prob. no more gain above 4 slices
// (xbrz::scale() derives the slice bounds from SrcH / NUM_SLICE and DstH / NUM_SLICE
// with integer division, so rows of a remainder would never be processed)
const unsigned char NUM_SLICE = 4;
// When defined, xbrz::scale() is built as a thread entry point that handles one
// slice per call and the plugin name gets the " MT" suffix; when undefined, the
// plain xbrz::scale() that scales the whole image in one call is built instead.
#define _XBRZ_MT_
//#undef _XBRZ_MT_
//////////////

////////////////////////////////////////////////////////////////////////////////
// Kega stuff
// Stand-ins for the <windows.h> type names used by the declarations below when
// the file is not built with _WINDOWS defined.
// NOTE(review): WINAPI and LPVOID, used by the threaded xbrz::scale(), are not
// provided here - the header comment states the file is Windows-only for now.
#ifndef _WINDOWS
	typedef unsigned long DWORD;
	typedef unsigned short WORD;
	typedef unsigned char BYTE;
	typedef void* HMODULE;
#endif

// Parameter block passed to RenderPluginOutput().
// The visible part of this file reads only SrcW, SrcH, DstPtr, DstW and DstH;
// the meaning of the remaining fields is inferred from their names - TODO confirm
// against the Kega Fusion plugin specification.
typedef struct {
	unsigned long Size;
	unsigned long Flags;
	void* SrcPtr;
	unsigned long SrcPitch;
	unsigned long SrcW; // source width in pixels (handed to scaleImage() as srcWidth)
	unsigned long SrcH; // source height in pixels (handed to scaleImage() as srcHeight)
	void* DstPtr; // destination surface; xbrz::scale() writes 16-bit WORD pixels to it
	unsigned long DstPitch;
	unsigned long DstW; // pixels converted per destination row in xbrz::scale()
	unsigned long DstH; // destination rows; split into NUM_SLICE parts in xbrz::scale()
	unsigned long OutW;
	unsigned long OutH;
} RENDER_PLUGIN_OUTP;

// Signature of the per-frame output callback (matches RenderPluginOutput()).
typedef void (*RENDPLUG_Output)(RENDER_PLUGIN_OUTP *);

// Plugin description returned by RenderPluginGetInfo().
// RenderPluginGetInfo() fills in Name and Flags; Handle and Output are not
// written anywhere in the visible part of this file.
typedef struct {
	char Name[60]; // NUL-terminated plugin name, e.g. "4xBRZ MT (Zenju)"
	unsigned long Flags; // RPI_VERSION combined with RPI_* capability flags
	HMODULE Handle;
	RENDPLUG_Output Output;
} RENDER_PLUGIN_INFO;

// Signature of the info query entry point (matches RenderPluginGetInfo()).
typedef RENDER_PLUGIN_INFO*(*RENDPLUG_GetInfo)(void);

// Values combined into RENDER_PLUGIN_INFO::Flags.
// RenderPluginGetInfo() uses RPI_VERSION, RPI_555_SUPP, RPI_565_SUPP and one of
// RPI_OUT_SCL2/3/4; the other flags are not referenced in the visible part of
// this file and their meaning is inferred from the names only - TODO confirm.
#define	RPI_VERSION		0x02
#define	RPI_MMX_USED	0x000000100
#define	RPI_MMX_REQD	0x000000200
#define	RPI_555_SUPP	0x000000400
#define	RPI_565_SUPP	0x000000800
#define	RPI_888_SUPP	0x000001000
#define	RPI_DST_WIDE	0x000008000
#define	RPI_OUT_SCL1	0x000010000
#define	RPI_OUT_SCL2	0x000020000
#define	RPI_OUT_SCL3	0x000030000
#define	RPI_OUT_SCL4	0x000040000

// The two plugin entry points, exported from the DLL with C linkage (no C++
// name mangling) when building for Windows.
#ifdef _WINDOWS
	extern "C" __declspec(dllexport) RENDER_PLUGIN_INFO* RenderPluginGetInfo(void);
	extern "C" __declspec(dllexport) void RenderPluginOutput(RENDER_PLUGIN_OUTP* rpo);
#endif
////////////////////////////////////////////////////////////////////////////////
// end Kega stuff

// One 32-bit pixel, accessible either as a whole word or as four bytes.
// xbrz::scale() treats rgbarr[2] as the channel that ends up in the top bits of
// the 16-bit output (red), rgbarr[1] as the middle one (green) and rgbarr[0] as
// the lowest one (blue); rgbarr[3] is not used in the visible code.
// NOTE(review): which byte of rgbint each rgbarr[] index aliases depends on the
// byte order of the machine.
typedef union {
	unsigned int rgbint;
	BYTE rgbarr[4];
} rgbpixel;

////////////////////////////////////////////////////////////////////////////////
// global vars
// Shared state between the plugin entry points and xbrz::scale().
// None of these are assigned in the visible part of this file; presumably
// RenderPluginOutput() sets them up before scaling - TODO confirm.
rgbpixel* picture_in32;  // 32-bit source image; passed to scaleImage() as "src"
rgbpixel* picture_out32; // 32-bit scaled image; passed to scaleImage() as "trg", then converted in place
RENDER_PLUGIN_OUTP* MyRPO; // parameter block of the frame being processed
RENDER_PLUGIN_INFO MyRPI;  // plugin description filled in and returned by RenderPluginGetInfo()
bool VideoFormat; // true: xbrz::scale() packs 5-5-5 output, false: 5-6-5 output
unsigned int pitchd; // destination row stride in WORD units (added to the WORD* row pointer per row)
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////


// Forward declaration of the scaler entry point (defined below under "Main caller").
// With _XBRZ_MT_ it has the signature of a Win32 thread procedure and handles
// one image slice per call; without it, it scales the whole image in one call.
namespace xbrz {
#ifdef _XBRZ_MT_
	DWORD WINAPI scale(LPVOID);
#else
	void scale();
#endif
} // namespace


namespace {

//bit masks of the three color channels inside a 0x00RRGGBB pixel word (used by alphaBlend)
const unsigned int redMask   = 0xff0000;
const unsigned int greenMask = 0x00ff00;
const unsigned int blueMask  = 0x0000ff;

//Rotation applied to the 3x3 input kernel / output block, in 90 degree steps.
//The numeric values 0..3 matter: MatrixRotation recurses with "rotDeg - 1" down to ROT_0.
enum RotationDegree { //clock-wise
	ROT_0,
	ROT_90,
	ROT_180,
	ROT_270
};

//Blend decision for one corner of a pixel, as produced by preProcessCorners().
//Four of these are packed into one byte (see getTopL()/setTopL() and friends),
//and scalePixel() compares them with >=, so the order of the values is significant.
enum BlendType {
	BLEND_NONE = 0,
	BLEND_NORMAL,   //a normal indication to blend
	BLEND_DOMINANT, //a strong indication to blend
	//attention: BlendType must fit into the value range of 2 bit!!!
};

//Result of preProcessCorners(): one blend decision for each of the four pixels
//F, G, J, K that meet at the evaluated corner (laid out like the 2x2 pixel group).
struct BlendResult {
	BlendType
	/**/blend_f, blend_g,
	/**/blend_j, blend_k;
};

//4x4 neighbourhood of source pixels, named row by row from A (top-left) to P
//(bottom-right); the input pixel is F. Filled by scaleImage(), read by preProcessCorners().
struct Kernel_4x4 { //kernel for preprocessing step
	unsigned int
	/**/a, b, c, d,
	/**/e, f, g, h,
	/**/i, j, k, l,
	/**/m, n, o, p;
};

//3x3 neighbourhood of source pixels, named row by row from A (top-left) to I
//(bottom-right); the input pixel is E. Read by scalePixel() through the
//rotation-aware get_a() .. get_i() accessors.
struct Kernel_3x3 {
	unsigned int
	/**/a,  b,  c,
	/**/d,  e,  f,
	/**/g,  h,  i;
};




//Blend color "col" over "dst" with opacity N / M; the ratio is fixed at compile time.
//Every channel is weighted while still sitting at its bit position inside the 0xRRGGBB word.
//This stays within 32 bits as long as M <= 256, because the upper 8 bits of the word are free.
//Bits 24-31 of the result are always zero.
template <unsigned int N, unsigned int M> inline
void alphaBlend(unsigned int& dst, unsigned int col) {
	const unsigned int keep  = M - N; //weight of the color already in dst
	const unsigned int red   = ((col & redMask  ) * N + (dst & redMask  ) * keep) / M;
	const unsigned int green = ((col & greenMask) * N + (dst & greenMask) * keep) / M;
	const unsigned int blue  = ((col & blueMask ) * N + (dst & blueMask ) * keep) / M;
	//the division leaves fractional bits below each channel: mask them away before merging
	dst = (red & redMask) | (green & greenMask) | (blue & blueMask);
}


//calculate input matrix coordinates after rotation at compile time
//Compile-time mapping of a cell (I, J) = (row, col) of an N x N matrix rotated by rotDeg
//back to its coordinates (I_old, J_old) before the rotation.
template <RotationDegree rotDeg, size_t I, size_t J, size_t N>
struct MatrixRotation;


//end of the recursion: without rotation the coordinates stay as they are
template <size_t I, size_t J, size_t N>
struct MatrixRotation<ROT_0, I, J, N> {
	static const size_t I_old = I;
	static const size_t J_old = J;
};


//general case: take the result for one 90 degree step less and turn it by one further step
template <RotationDegree rotDeg, size_t I, size_t J, size_t N>
struct MatrixRotation {
	typedef MatrixRotation<static_cast<RotationDegree>(rotDeg - 1), I, J, N> OneStepLess;
	static const size_t I_old = N - 1 - OneStepLess::J_old; //old coordinates before rotation!
	static const size_t J_old =         OneStepLess::I_old;
};


//View onto an N x N block of the target image whose cells are addressed in a coordinate
//system rotated by rotDeg. The rotation is resolved at compile time via MatrixRotation,
//so ref<I, J>() boils down to a fixed offset from the block's top-left pixel.
template <size_t N, RotationDegree rotDeg>
class OutputMatrix {
	public:
		//"out": top-left pixel of the block, "outWidth": length of one target image row in pixels
		OutputMatrix(unsigned int* out, int outWidth) :
			out_(out),
			outWidth_(outWidth) {}
	//target pixel that cell (I, J) = (row, col) of the rotated block maps to
	template <size_t I, size_t J>
	unsigned int& ref() const {
		typedef MatrixRotation<rotDeg, I, J, N> Unrotated;
		static const size_t row = Unrotated::I_old;
		static const size_t col = Unrotated::J_old;
		return out_[col + row * outWidth_];
	}
	private:
		unsigned int* out_;
		const int outWidth_;
};


//Distance between two 0xRRGGBB colors, measured in a YCbCr space built from the
//ITU-R BT.709 luma coefficients.
//The conversion is a matrix multiplication, so its linearity is exploited by subtracting the
//channels first and transforming only the difference.
//The division by 255 is skipped: the result stays on the 0..255 channel scale that the
//thresholds used by the callers are written against.
//Callers short-circuit identical pixels themselves, hence no equality check in here.
//The channels are extracted with shifts rather than by reading single bytes of the word
//through a pointer, so the result does not depend on the byte order of the machine.
FORCE_INLINE
float distYCbCr(const unsigned int& pix1, const unsigned int& pix2) {
	const int r_diff = static_cast<int>((pix1 >> 16) & 0xff) - static_cast<int>((pix2 >> 16) & 0xff);
	const int g_diff = static_cast<int>((pix1 >>  8) & 0xff) - static_cast<int>((pix2 >>  8) & 0xff);
	const int b_diff = static_cast<int>( pix1        & 0xff) - static_cast<int>( pix2        & 0xff);
	//ITU-R BT.709 conversion
	const float y   = 0.2126F * r_diff + 0.7152F * g_diff + 0.0722F * b_diff; //[!], analog YCbCr!
	const float c_b = (b_diff - y) * 0.5389F; //0.5 / (1 - 0.0722)
	const float c_r = (r_diff - y) * 0.635F;  //0.5 / (1 - 0.2126)
	return std::sqrt(y*y + c_b*c_b +  c_r*c_r);
}


/*
input kernel area naming convention:
-----------------
| A | B | C | D |
----|---|---|---|
| E | F | G | H |   //evaluate the four corners between F, G, J, K
----|---|---|---|   //input pixel is at position F
| I | J | K | L |
----|---|---|---|
| M | N | O | P |
-----------------
*/
//color distance that answers identical colors without calling distYCbCr()
FORCE_INLINE
float distUnlessEqual(unsigned int col1, unsigned int col2) {
	if (col1 == col2)
		return 0;
	return distYCbCr(col1, col2);
}


//Detect the blend direction at the corner shared by the pixels F, G, J and K.
//The accumulated color distance along the J-G diagonal is compared with the one along the
//F-K diagonal; the pixels lying on the other diagonal get marked for blending, as
//BLEND_DOMINANT when the two sums differ by more than the factor 3.6.
//Returns all-BLEND_NONE if the 2x2 group consists of two equal rows or two equal columns,
//or if both sums are equal.
FORCE_INLINE
BlendResult preProcessCorners(const Kernel_4x4& ker) {
	BlendResult result = {};
	const bool equalRows = ker.f == ker.g && ker.j == ker.k;
	const bool equalCols = ker.f == ker.j && ker.g == ker.k;
	if (equalRows || equalCols)
		return result;
	//the pair on the diagonal itself counts with weight 4
	const float jg = distUnlessEqual(ker.i, ker.f) + distUnlessEqual(ker.f, ker.c)
	               + distUnlessEqual(ker.n, ker.k) + distUnlessEqual(ker.k, ker.h)
	               + 4 * distUnlessEqual(ker.j, ker.g);
	const float fk = distUnlessEqual(ker.e, ker.j) + distUnlessEqual(ker.j, ker.o)
	               + distUnlessEqual(ker.b, ker.g) + distUnlessEqual(ker.g, ker.l)
	               + 4 * distUnlessEqual(ker.f, ker.k);
	//test sample: 70% of values max(jg, fk) / min(jg, fk) are between 1.1 and 3.7 with median being 1.8
	if (jg < fk) {
		const BlendType strength = (3.6F * jg < fk) ? BLEND_DOMINANT : BLEND_NORMAL;
		if (ker.f != ker.g && ker.f != ker.j)
			result.blend_f = strength;
		if (ker.k != ker.j && ker.k != ker.g)
			result.blend_k = strength;
	}
	else if (fk < jg) {
		const BlendType strength = (3.6F * fk < jg) ? BLEND_DOMINANT : BLEND_NORMAL;
		if (ker.j != ker.f && ker.j != ker.k)
			result.blend_j = strength;
		if (ker.g != ker.f && ker.g != ker.k)
			result.blend_g = strength;
	}
	return result;
}


//Four blend types are compressed into a single byte, 2 bits each:
//bits 0-1 = top-left, bits 2-3 = top-right, bits 4-5 = bottom-right, bits 6-7 = bottom-left
inline BlendType getTopL(unsigned char b) {
	return static_cast<BlendType>(b & 0x3);
}
inline BlendType getTopR(unsigned char b) {
	return static_cast<BlendType>((b >> 2) & 0x3);
}
inline BlendType getBottomR(unsigned char b) {
	return static_cast<BlendType>((b >> 4) & 0x3);
}
inline BlendType getBottomL(unsigned char b) {
	return static_cast<BlendType>((b >> 6) & 0x3);
}
//The setters only OR the new bits in: the byte is assumed to be initialized before
//preprocessing, i.e. the field being set must still be zero.
inline void setTopL(unsigned char& b, BlendType bt) {
	b = static_cast<unsigned char>(b | bt);
}
inline void setTopR(unsigned char& b, BlendType bt) {
	b = static_cast<unsigned char>(b | (bt << 2));
}
inline void setBottomR(unsigned char& b, BlendType bt) {
	b = static_cast<unsigned char>(b | (bt << 4));
}
inline void setBottomL(unsigned char& b, BlendType bt) {
	b = static_cast<unsigned char>(b | (bt << 6));
}


//Rotate the packed corner information (see getTopL() etc.) to match a rotated kernel:
//each 90 degree step is a circular left shift of the byte by one 2-bit field.
//The primary template serves ROT_0 and returns the byte unchanged.
template <RotationDegree rotDeg> inline
unsigned char rotateBlendInfo(unsigned char b) {
	return b;
}
template <> inline unsigned char rotateBlendInfo<ROT_90 >(unsigned char b) {
	const unsigned int bits = b;
	return static_cast<unsigned char>((bits << 2) | (bits >> 6)); //the cast drops everything above bit 7
}
template <> inline unsigned char rotateBlendInfo<ROT_180>(unsigned char b) {
	const unsigned int bits = b;
	return static_cast<unsigned char>((bits << 4) | (bits >> 4));
}
template <> inline unsigned char rotateBlendInfo<ROT_270>(unsigned char b) {
	const unsigned int bits = b;
	return static_cast<unsigned char>((bits << 6) | (bits >> 2));
}


//Rotation-aware accessors for Kernel_3x3: get_x<rotDeg>(ker) returns the pixel that sits at
//position x once the kernel is looked at rotated by rotDeg.
//The primary templates (used for ROT_0) return the member of the same name; the
//specializations below list, per rotation, which member each position maps to.
#define XBRZ_DEF_GETTER(x) \
	template <RotationDegree rotDeg> inline unsigned int get_##x(const Kernel_3x3& ker) { return ker.x; }
XBRZ_DEF_GETTER(a) XBRZ_DEF_GETTER(b) XBRZ_DEF_GETTER(c)
XBRZ_DEF_GETTER(d) XBRZ_DEF_GETTER(e) XBRZ_DEF_GETTER(f)
XBRZ_DEF_GETTER(g) XBRZ_DEF_GETTER(h) XBRZ_DEF_GETTER(i)
#undef XBRZ_DEF_GETTER

#define XBRZ_DEF_GETTER(x, y) \
	template <> inline unsigned int get_##x<ROT_90>(const Kernel_3x3& ker) { return ker.y; }
XBRZ_DEF_GETTER(a, g) XBRZ_DEF_GETTER(b, d) XBRZ_DEF_GETTER(c, a)
XBRZ_DEF_GETTER(d, h) XBRZ_DEF_GETTER(e, e) XBRZ_DEF_GETTER(f, b)
XBRZ_DEF_GETTER(g, i) XBRZ_DEF_GETTER(h, f) XBRZ_DEF_GETTER(i, c)
#undef XBRZ_DEF_GETTER

#define XBRZ_DEF_GETTER(x, y) \
	template <> inline unsigned int get_##x<ROT_180>(const Kernel_3x3& ker) { return ker.y; }
XBRZ_DEF_GETTER(a, i) XBRZ_DEF_GETTER(b, h) XBRZ_DEF_GETTER(c, g)
XBRZ_DEF_GETTER(d, f) XBRZ_DEF_GETTER(e, e) XBRZ_DEF_GETTER(f, d)
XBRZ_DEF_GETTER(g, c) XBRZ_DEF_GETTER(h, b) XBRZ_DEF_GETTER(i, a)
#undef XBRZ_DEF_GETTER

#define XBRZ_DEF_GETTER(x, y) \
	template <> inline unsigned int get_##x<ROT_270>(const Kernel_3x3& ker) { return ker.y; }
XBRZ_DEF_GETTER(a, c) XBRZ_DEF_GETTER(b, f) XBRZ_DEF_GETTER(c, i)
XBRZ_DEF_GETTER(d, b) XBRZ_DEF_GETTER(e, e) XBRZ_DEF_GETTER(f, h)
XBRZ_DEF_GETTER(g, a) XBRZ_DEF_GETTER(h, d) XBRZ_DEF_GETTER(i, g)
#undef XBRZ_DEF_GETTER


/*
input kernel area naming convention:
-------------
| A | B | C |
----|---|---|
| D | E | F | //input pixel is at position E
----|---|---|
| G | H | I |
-------------
*/
//true if both colors are identical or their distance is below the equal-color tolerance of 30
FORCE_INLINE
bool isSimilarColor(unsigned int col1, unsigned int col2) {
	return col1 == col2 || distYCbCr(col1, col2) < 30;
}


//Blend the bottom-right corner of the scaled block of pixel "e", seen in the kernel rotated
//by rotDeg (the caller invokes this once per rotation to cover all four corners).
//  ker       3x3 neighbourhood of the input pixel (unrotated)
//  target    top-left pixel of the Scaler::scale x Scaler::scale output block, already filled with "e"
//  trgWidth  length of one target image row in pixels
//  blendInfo packed result of preprocessing all four corners of pixel "e" (unrotated)
//Nothing is written unless the (rotated) bottom-right corner is marked for blending.
template <class Scaler, RotationDegree rotDeg>
FORCE_INLINE //perf: quite worth it!
void scalePixel(const Kernel_3x3& ker,unsigned int* target, int trgWidth,
	unsigned char blendInfo //result of preprocessing all four corners of pixel "e"
	) {
	const unsigned char blend = rotateBlendInfo<rotDeg>(blendInfo);
	if (getBottomR(blend) < BLEND_NORMAL)
		return;
	const unsigned int ee = get_e<rotDeg>(ker);
	const unsigned int ff = get_f<rotDeg>(ker);
	const unsigned int hh = get_h<rotDeg>(ker);
	const unsigned int gg = get_g<rotDeg>(ker);
	const unsigned int cc = get_c<rotDeg>(ker);
	const unsigned int ii = get_i<rotDeg>(ker);
	//A dominant gradient always gets the full line blend. The exclusions below apply to
	//normal gradients only; they must not be able to veto a dominant one.
	bool doLineBlend = true;
	if (getBottomR(blend) < BLEND_DOMINANT) {
		//make sure there is no second blending in an adjacent rotation for this pixel: handles insular pixels, mario eyes
		if (getTopR(blend) != BLEND_NONE && !isSimilarColor(ee, gg)) //but support double-blending for 90 degree corners
			doLineBlend = false;
		else if (getBottomL(blend) != BLEND_NONE && !isSimilarColor(ee, cc))
			doLineBlend = false;
		//no full blending for L-shapes; blend corner only (handles "mario mushroom eyes")
		else if (isSimilarColor(gg, hh) && isSimilarColor(hh, ii) && isSimilarColor(ii, ff) && isSimilarColor(ff, cc) && !isSimilarColor(ee, ii))
			doLineBlend = false;
	}
	//choose most similar color (identical colors are answered without computing a distance)
	const unsigned int px = ((ee == ff)?(0):(distYCbCr(ee, ff))) <= ((ee == hh)?(0):(distYCbCr(ee, hh))) ? ff : hh;
	OutputMatrix<Scaler::scale, rotDeg> out(target, trgWidth);
	if (!doLineBlend) {
		Scaler::blendCorner(px, out);
		return;
	}
	//test sample: 70% of values max(fg, hc) / min(fg, hc) are between 1.1 and 3.7 with median being 1.9
	const float fg = ((ff == gg)?(0):(distYCbCr(ff, gg)));
	const float hc = ((hh == cc)?(0):(distYCbCr(hh, cc)));
	const bool haveShallowLine = 2.2F * fg <= hc && ee != gg && get_d<rotDeg>(ker) != gg;
	const bool haveSteepLine   = 2.2F * hc <= fg && ee != cc && get_b<rotDeg>(ker) != cc;
	if (haveShallowLine) {
		if (haveSteepLine)
			Scaler::blendLineSteepAndShallow(px, out);
		else
			Scaler::blendLineShallow(px, out);
	}
	else {
		if (haveSteepLine)
			Scaler::blendLineSteep(px, out);
		else
			Scaler::blendLineDiagonal(px,out);
	}
}


template <class Scaler> //scaler policy: see "Scaler2x" reference implementation
void scaleImage(const unsigned int* src, unsigned int* trg, int srcWidth, int srcHeight, int yFirst, int yLast) {
	/*yFirst = ((yFirst<0)?(0):(yFirst));
	yLast  = ((!(srcHeight<yLast))?(yLast):(srcHeight));
	if (yFirst >= yLast || srcWidth <= 0)
		return;*/
	const int trgWidth = srcWidth * Scaler::scale;
	//"use" space at the end of the image as temporary buffer for "on the fly preprocessing": we even could use larger area of
	//"sizeof(uint32_t) * srcWidth * (yLast - yFirst)" bytes without risk of accidental overwriting before accessing
	const int bufferSize = srcWidth;
	unsigned char* preProcBuffer = reinterpret_cast<unsigned char*>(trg + yLast * Scaler::scale * trgWidth) - bufferSize;
	//std::fill(preProcBuffer, preProcBuffer + bufferSize, 0);
	//initialize preprocessing buffer for first row: detect upper left and right corner blending
	//this cannot be optimized for adjacent processing stripes; we must not allow for a memory race condition!
	if (yFirst > 0) {
		const int y = yFirst - 1;
		const unsigned int* s_m1 = src + srcWidth * ((y-1<0)?(0):(y-1));
		const unsigned int* s_0  = src + srcWidth * y; //center line
		const unsigned int* s_p1 = src + srcWidth * ((!(srcHeight-1<y+1))?(y+1):(srcHeight-1));
		const unsigned int* s_p2 = src + srcWidth * ((!(srcHeight-1<y+2))?(y+2):(srcHeight-1));
		for (int x = 0; x < srcWidth; ++x) {
			const int x_m1 = ((x-1<0)?(0):(x-1));
			const int x_p1 = ((!(srcWidth-1<x+1))?(x+1):(srcWidth-1));
			const int x_p2 = ((!(srcWidth-1<x+2))?(x+2):(srcWidth-1));
			Kernel_4x4 ker = {}; //perf: initialization is negligable
			ker.a = s_m1[x_m1]; //read sequentially from memory as far as possible
			ker.b = s_m1[x];
			ker.c = s_m1[x_p1];
			ker.d = s_m1[x_p2];
			ker.e = s_0[x_m1];
			ker.f = s_0[x];
			ker.g = s_0[x_p1];
			ker.h = s_0[x_p2];
			ker.i = s_p1[x_m1];
			ker.j = s_p1[x];
			ker.k = s_p1[x_p1];
			ker.l = s_p1[x_p2];
			ker.m = s_p2[x_m1];
			ker.n = s_p2[x];
			ker.o = s_p2[x_p1];
			ker.p = s_p2[x_p2];
			const BlendResult res = preProcessCorners(ker);
			/*
			preprocessing blend result:
			---------
			| F | G |   //evalute corner between F, G, J, K
			----|---|   //input pixel is at position F
			| J | K |
			---------
			*/
			setTopR(preProcBuffer[x], res.blend_j);
			if (x + 1 < srcWidth)
				setTopL(preProcBuffer[x + 1], res.blend_k);
		}
	}
	//------------------------------------------------------------------------------------
	unsigned int* tt; // new
	unsigned int ii, jj; // new
	for (int y = yFirst; y < yLast; ++y) {
		unsigned int* out = trg + Scaler::scale * y * trgWidth; //consider MT "striped" access
		const unsigned int* s_m1 = src + srcWidth * (y-((y>0)?(1):(0)));
		const unsigned int* s_0  = src + srcWidth * y; //center line
		const unsigned int* s_p1 = src + srcWidth * ((!(srcHeight-1<y+1))?(y+1):(srcHeight-1));
		const unsigned int* s_p2 = src + srcWidth * ((!(srcHeight-1<y+2))?(y+2):(srcHeight-1));
		unsigned char blend_xy1 = 0; //corner blending for current (x, y + 1) position
		for (int x = 0; x < srcWidth; ++x, out += Scaler::scale) {
			//all those bounds checks have only insignificant impact on performance!
			const int x_m1 = x-((x>0)?(1):(0)); //perf: prefer array indexing to additional pointers!
			const int x_p1 = ((!(x+1<srcWidth-1))?(srcWidth-1):(x+1));
			const int x_p2 = ((!(x+2<srcWidth-1))?(srcWidth-1):(x+2));
			//evaluate the four corners on bottom-right of current pixel
			unsigned char blend_xy = 0; { //for current (x, y) position
				Kernel_4x4 ker = {}; //perf: initialization is negligable
				ker.a = s_m1[x_m1]; //read sequentially from memory as far as possible
				ker.b = s_m1[x];
				ker.c = s_m1[x_p1];
				ker.d = s_m1[x_p2];
				ker.e = s_0[x_m1];
				ker.f = s_0[x];
				ker.g = s_0[x_p1];
				ker.h = s_0[x_p2];
				ker.i = s_p1[x_m1];
				ker.j = s_p1[x];
				ker.k = s_p1[x_p1];
				ker.l = s_p1[x_p2];
				ker.m = s_p2[x_m1];
				ker.n = s_p2[x];
				ker.o = s_p2[x_p1];
				ker.p = s_p2[x_p2];
				const BlendResult res = preProcessCorners(ker);
				/*
				preprocessing blend result:
				---------
				| F | G |   //evalute corner between F, G, J, K
				----|---|   //current input pixel is at position F
				| J | K |
				---------
				*/
				blend_xy = preProcBuffer[x];
				setBottomR(blend_xy, res.blend_f); //all four corners of (x, y) have been determined at this point due to processing sequence!
				setTopR(blend_xy1, res.blend_j); //set 2nd known corner for (x, y + 1)
				preProcBuffer[x] = blend_xy1; //store on current buffer position for use on next row
				blend_xy1 = 0;
				setTopL(blend_xy1, res.blend_k); //set 1st known corner for (x + 1, y + 1) and buffer for use on next column
				if (x + 1 < srcWidth) //set 3rd known corner for (x + 1, y)
					setBottomL(preProcBuffer[x + 1], res.blend_g);
			}
			//fill block of size scale * scale with the given color
			//fillBlock(out, trgWidth * sizeof(unsigned int), s_0[x], Scaler::scale); //place *after* preprocessing step, to not overwrite the results while processing the the last pixel!
			// new start
			for (ii = 0, tt = out; ii < Scaler::scale; ++ii, tt += trgWidth)
				for (jj = 0; jj < Scaler::scale; ++jj)
					tt[jj] = s_0[x];
			// end new
			//blend four corners of current pixel
			if (blend_xy != 0) { //good 20% perf-improvement
				Kernel_3x3 ker = {}; //perf: initialization is negligable
				ker.a = s_m1[x_m1]; //read sequentially from memory as far as possible
				ker.b = s_m1[x];
				ker.c = s_m1[x_p1];
				ker.d = s_0[x_m1];
				ker.e = s_0[x];
				ker.f = s_0[x_p1];
				ker.g = s_p1[x_m1];
				ker.h = s_p1[x];
				ker.i = s_p1[x_p1];
				scalePixel<Scaler, ROT_0  >(ker, out, trgWidth, blend_xy);
				scalePixel<Scaler, ROT_90 >(ker, out, trgWidth, blend_xy);
				scalePixel<Scaler, ROT_180>(ker, out, trgWidth, blend_xy);
				scalePixel<Scaler, ROT_270>(ker, out, trgWidth, blend_xy);
			}
		}
	}
}


//Scaler policy for factor 2: tells scalePixel() which cells of the 2x2 output block to blend
//with the neighbour color "px", and how strongly, for each detected edge shape.
//Cells are addressed as ref<row, col>() with (1, 1) being the bottom-right corner.
struct Scaler2x {
	static const int scale = 2;
	template <class Matrix>
	static void blendLineShallow(unsigned int px, Matrix& m) {
		alphaBlend<1, 4>(m.template ref<1, 0>(), px);
		alphaBlend<3, 4>(m.template ref<1, 1>(), px);
	}
	template <class Matrix>
	static void blendLineSteep(unsigned int px, Matrix& m) {
		alphaBlend<1, 4>(m.template ref<0, 1>(), px);
		alphaBlend<3, 4>(m.template ref<1, 1>(), px);
	}
	template <class Matrix>
	static void blendLineSteepAndShallow(unsigned int px, Matrix& m) {
		alphaBlend<1, 4>(m.template ref<1, 0>(), px);
		alphaBlend<1, 4>(m.template ref<0, 1>(), px);
		alphaBlend<5, 6>(m.template ref<1, 1>(), px); //[!] fixes 7/8 used in xBR
	}
	template <class Matrix>
	static void blendLineDiagonal(unsigned int px, Matrix& m) {
		alphaBlend<1, 2>(m.template ref<1, 1>(), px);
	}
	template <class Matrix>
	static void blendCorner(unsigned int px, Matrix& m) {
		//model a round corner
		alphaBlend<21, 100>(m.template ref<1, 1>(), px); //exact: 1 - pi/4 = 0.2146018366
	}
};


//Scaler policy for factor 3: tells scalePixel() which cells of the 3x3 output block to blend
//with (or replace by) the neighbour color "px" for each detected edge shape.
//Cells are addressed as ref<row, col>() with (2, 2) being the bottom-right corner.
struct Scaler3x {
	static const int scale = 3;
	template <class Matrix>
	static void blendLineShallow(unsigned int px, Matrix& m) {
		alphaBlend<1, 4>(m.template ref<2, 0>(), px);
		alphaBlend<1, 4>(m.template ref<1, 2>(), px);
		alphaBlend<3, 4>(m.template ref<2, 1>(), px);
		m.template ref<2, 2>() = px;
	}
	template <class Matrix>
	static void blendLineSteep(unsigned int px, Matrix& m) {
		alphaBlend<1, 4>(m.template ref<0, 2>(), px);
		alphaBlend<1, 4>(m.template ref<2, 1>(), px);
		alphaBlend<3, 4>(m.template ref<1, 2>(), px);
		m.template ref<2, 2>() = px;
	}
	template <class Matrix>
	static void blendLineSteepAndShallow(unsigned int px, Matrix& m) {
		alphaBlend<1, 4>(m.template ref<2, 0>(), px);
		alphaBlend<1, 4>(m.template ref<0, 2>(), px);
		alphaBlend<3, 4>(m.template ref<2, 1>(), px);
		alphaBlend<3, 4>(m.template ref<1, 2>(), px);
		m.template ref<2, 2>() = px;
	}
	template <class Matrix>
	static void blendLineDiagonal(unsigned int px, Matrix& m) {
		alphaBlend<1, 8>(m.template ref<1, 2>(), px);
		alphaBlend<1, 8>(m.template ref<2, 1>(), px);
		alphaBlend<7, 8>(m.template ref<2, 2>(), px);
	}
	template <class Matrix>
	static void blendCorner(unsigned int px, Matrix& m) {
		//model a round corner
		alphaBlend<45, 100>(m.template ref<2, 2>(), px); //exact: 0.4545939598
		//the cells (2, 1) and (1, 2) would get 0.01413008627 each -> negligible, left untouched
	}
};


//Scaler policy for factor 4: tells scalePixel() which cells of the 4x4 output block to blend
//with (or replace by) the neighbour color "px" for each detected edge shape.
//Cells are addressed as ref<row, col>() with (3, 3) being the bottom-right corner.
struct Scaler4x {
	static const int scale = 4;
	template <class Matrix>
	static void blendLineShallow(unsigned int px, Matrix& m) {
		alphaBlend<1, 4>(m.template ref<3, 0>(), px);
		alphaBlend<1, 4>(m.template ref<2, 2>(), px);
		alphaBlend<3, 4>(m.template ref<3, 1>(), px);
		alphaBlend<3, 4>(m.template ref<2, 3>(), px);
		m.template ref<3, 2>() = px;
		m.template ref<3, 3>() = px;
	}
	template <class Matrix>
	static void blendLineSteep(unsigned int px, Matrix& m) {
		alphaBlend<1, 4>(m.template ref<0, 3>(), px);
		alphaBlend<1, 4>(m.template ref<2, 2>(), px);
		alphaBlend<3, 4>(m.template ref<1, 3>(), px);
		alphaBlend<3, 4>(m.template ref<3, 2>(), px);
		m.template ref<2, 3>() = px;
		m.template ref<3, 3>() = px;
	}
	template <class Matrix>
	static void blendLineSteepAndShallow(unsigned int px, Matrix& m) {
		alphaBlend<3, 4>(m.template ref<3, 1>(), px);
		alphaBlend<3, 4>(m.template ref<1, 3>(), px);
		alphaBlend<1, 4>(m.template ref<3, 0>(), px);
		alphaBlend<1, 4>(m.template ref<0, 3>(), px);
		alphaBlend<1, 3>(m.template ref<2, 2>(), px); //[!] fixes 1/4 used in xBR
		m.template ref<2, 3>() = px;
		m.template ref<3, 2>() = px;
		m.template ref<3, 3>() = px;
	}
	template <class Matrix>
	static void blendLineDiagonal(unsigned int px, Matrix& m) {
		alphaBlend<1, 2>(m.template ref<3, 2>(), px);
		alphaBlend<1, 2>(m.template ref<2, 3>(), px);
		m.template ref<3, 3>() = px;
	}
	template <class Matrix>
	static void blendCorner(unsigned int px, Matrix& m) {
		//model a round corner
		alphaBlend<68, 100>(m.template ref<3, 3>(), px); //exact: 0.6848532563
		alphaBlend< 9, 100>(m.template ref<3, 2>(), px); //0.08677704501
		alphaBlend< 9, 100>(m.template ref<2, 3>(), px); //0.08677704501
	}
};


} // namespace


////////////////////////////////////////////////////////////////////////////////
// Main caller
#ifdef _XBRZ_MT_
	// Thread entry point: scales one horizontal slice of the source image and converts the
	// matching slice of the 32-bit result to 16-bit pixels in the destination surface.
	// lpParam must point to an unsigned char holding the slice index (0 .. NUM_SLICE-1).
	// Reads the globals picture_in32, picture_out32, MyRPO, VideoFormat and pitchd.
	// Note that the conversion also writes the reduced channel values back into picture_out32.
	// Always returns 0.
	DWORD WINAPI xbrz::scale(LPVOID lpParam) {
		const unsigned char slice = *(unsigned char*)lpParam;
		const unsigned int* const src = (unsigned int*)picture_in32;
		unsigned int* const trg = (unsigned int*)picture_out32;
		const int yFirst = (MyRPO->SrcH/NUM_SLICE)*slice;
		const int yLast  = (MyRPO->SrcH/NUM_SLICE)*(slice+1);
		switch(_SCALER_) {
			case 2:
				scaleImage<Scaler2x>(src, trg, MyRPO->SrcW, MyRPO->SrcH, yFirst, yLast);
				break;
			case 3:
				scaleImage<Scaler3x>(src, trg, MyRPO->SrcW, MyRPO->SrcH, yFirst, yLast);
				break;
			default: // 4 or sth. else
				scaleImage<Scaler4x>(src, trg, MyRPO->SrcW, MyRPO->SrcH, yFirst, yLast);
				break;
		}
		// first scaled pixel and first destination row of this slice
		const unsigned int firstPixel = ((MyRPO->DstH*MyRPO->DstW)/NUM_SLICE)*slice;
		rgbpixel* px = picture_out32 + firstPixel;
		WORD* dst = (WORD*)MyRPO->DstPtr;
		dst += (MyRPO->DstH/NUM_SLICE)*slice*pitchd;
		const unsigned long rowsInSlice = MyRPO->DstH/NUM_SLICE;
		const unsigned long rowWidth = MyRPO->DstW;
		unsigned int row, col;
		// one complete loop nest per video mode, so the mode is not tested per line or pixel -> performance
		if(VideoFormat) { // 5-5-5
			for(row=0; row<rowsInSlice; row++, dst+=pitchd) {
				for(col=0; col<rowWidth; col++, px++) {
					px->rgbarr[2] >>= 3;
					px->rgbarr[1] >>= 3;
					px->rgbarr[0] >>= 3;
					dst[col]=(px->rgbarr[2] << 10) | (px->rgbarr[1] << 5) | px->rgbarr[0];
				}
			}
		}
		else { // 5-6-5
			for(row=0; row<rowsInSlice; row++, dst+=pitchd) {
				for(col=0; col<rowWidth; col++, px++) {
					px->rgbarr[2] >>= 3;
					px->rgbarr[1] >>= 2;
					px->rgbarr[0] >>= 3;
					dst[col]=(px->rgbarr[2] << 11) | (px->rgbarr[1] << 5) | px->rgbarr[0];
				}
			}
		}
		return 0;
	}
#else
	// Scales the whole source image (picture_in32) into picture_out32 in one call, using the
	// scaler selected by _SCALER_. No 16-bit conversion is done in this variant.
	void xbrz::scale() {
		const unsigned int* const src = (unsigned int*)picture_in32;
		unsigned int* const trg = (unsigned int*)picture_out32;
		switch(_SCALER_) {
			case 2:
				scaleImage<Scaler2x>(src, trg, MyRPO->SrcW, MyRPO->SrcH, 0, MyRPO->SrcH);
				break;
			case 3:
				scaleImage<Scaler3x>(src, trg, MyRPO->SrcW, MyRPO->SrcH, 0, MyRPO->SrcH);
				break;
			default: // 4 or sth. else
				scaleImage<Scaler4x>(src, trg, MyRPO->SrcW, MyRPO->SrcH, 0, MyRPO->SrcH);
				break;
		}
	}
#endif
////////////////////////////////////////////////////////////////////////////////





////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// Kega


// Kega helper function
// Copies the NUL-terminated string `in` to `out`, terminator included.
// No bounds checking is done: `out` must have room for the whole string.
// `in` is a pointer-to-const so that string literals can be passed without
// the literal -> char* conversion, which is ill-formed since C++11; callers
// holding a plain char* keep working through the implicit conversion.
static void rpi_strcpy(char* out, const char* in) {
	// Copy first, then test the byte just written, so the terminator is copied too.
	while((*out++ = *in++) != '\0') {
	}
}


// Kega entry point: fills the global plugin description MyRPI and returns it.
//
// Flags always carry the plugin version plus 555/565 support; the output
// scale flag and the display name depend on _SCALER_ (and on _XBRZ_MT_ for
// the multithreaded build).
//
// Naming rules from the plugin interface: "Name (Original Author)", correct
// capitalisation, and NEVER more than 60 characters:
//     "............................................................"
extern "C" RENDER_PLUGIN_INFO* RenderPluginGetInfo(void) {
	switch(_SCALER_) {
		case 2:
			MyRPI.Flags = RPI_VERSION | RPI_555_SUPP | RPI_565_SUPP | RPI_OUT_SCL2;
			#ifdef _XBRZ_MT_
				rpi_strcpy(&MyRPI.Name[0], "2xBRZ MT (Zenju)");
			#else
				rpi_strcpy(&MyRPI.Name[0], "2xBRZ (Zenju)");
			#endif
			break;
		case 3:
			MyRPI.Flags = RPI_VERSION | RPI_555_SUPP | RPI_565_SUPP | RPI_OUT_SCL3;
			#ifdef _XBRZ_MT_
				rpi_strcpy(&MyRPI.Name[0], "3xBRZ MT (Zenju)");
			#else
				rpi_strcpy(&MyRPI.Name[0], "3xBRZ (Zenju)");
			#endif
			break;
		default: // 4 or anything else is announced as the 4x plugin
			MyRPI.Flags = RPI_VERSION | RPI_555_SUPP | RPI_565_SUPP | RPI_OUT_SCL4;
			#ifdef _XBRZ_MT_
				rpi_strcpy(&MyRPI.Name[0], "4xBRZ MT (Zenju)");
			#else
				rpi_strcpy(&MyRPI.Name[0], "4xBRZ (Zenju)");
			#endif
			break;
	}
	// Hand the filled-in info structure back to the caller.
	return &MyRPI;
}


// Kega render callback: scales the 16 bit source picture described by `rpo`
// with xBRZ and writes the 16 bit result to the destination surface.
//
// Steps:
//   1. expand the source (555 or 565, selected by RPI_565_SUPP in rpo->Flags)
//      to 8 bit per channel in picture_in32,
//   2. run the xBRZ scaler (sliced over NUM_SLICE threads if _XBRZ_MT_ is set),
//   3. pack picture_out32 back to 16 bit in rpo->DstPtr,
//   4. report the produced size in rpo->OutW / rpo->OutH.
//
// If the scaled picture does not fit into the destination, or the source does
// not fit into the conversion buffers, nothing is rendered and OutW/OutH are
// left untouched.
extern "C" void RenderPluginOutput(RENDER_PLUGIN_OUTP* rpo) {
	MyRPO = rpo;
	// Make sure I can use this renderer - in this case, width/height checks.
	if(((rpo->SrcW*_SCALER_)>rpo->DstW) || ((rpo->SrcH*_SCALER_)>rpo->DstH))
		return;
	// picture_in32 / picture_out32 are sized for a 320*480 source (see DllMain());
	// refuse anything larger instead of writing past the buffers.
	// The division keeps the check free of multiplication overflow.
	const unsigned int maxSrcPixels = 320*480;
	if((rpo->SrcW!=0) && (rpo->SrcH>(maxSrcPixels/rpo->SrcW)))
		return;

	// true -> 555 layout, false -> 565 layout
	VideoFormat = !(rpo->Flags & RPI_565_SUPP);
	pitchd = (unsigned int)rpo->DstPitch/sizeof(WORD);
	WORD* src = (WORD*)rpo->SrcPtr;
	unsigned int pitchs = (int)rpo->SrcPitch/sizeof(WORD);
	unsigned int i, j;
	unsigned int ccnt=0;
	BYTE r, g, b;
	//////////////////////////////////////////////////
	// 16 -> 32 bit conversion.
	// we prob. won't get a distinct speedup with sliced 16->32 conversion,
	// since the input picture is quite small and most cycles are used
	// for the main xBRZ conversion anyway...
	// complete vmode unravel, since otherwise we'd need to check it in every line or pixel -> performance
	// The top bits of every channel are replicated into the low bits, so the
	// maximum channel value maps to 0xFF (cheaper than the exact *255/31 resp. *255/63).
	if(VideoFormat) {
		for(i=0; i<rpo->SrcH; i++) {
			for(j=0; j<rpo->SrcW; j++) {
				r = (src[j] & 0x7C00) >> 10;
				g = (src[j] & 0x3E0) >> 5;
				b = (src[j] & 0x1F);
				picture_in32[ccnt].rgbarr[2] = (r << 3) | (r >> 2);
				picture_in32[ccnt].rgbarr[1] = (g << 3) | (g >> 2);
				picture_in32[ccnt].rgbarr[0] = (b << 3) | (b >> 2);
				ccnt++;
			}
			src+=pitchs;
		}
	}
	else {
		for(i=0; i<rpo->SrcH; i++) {
			for(j=0; j<rpo->SrcW; j++) {
				r = (src[j] & 0xF800) >> 11;
				g = (src[j] & 0x7E0) >> 5;
				b = (src[j] & 0x1F);
				picture_in32[ccnt].rgbarr[2] = (r << 3) | (r >> 2);
				picture_in32[ccnt].rgbarr[1] = (g << 2) | (g >> 4);
				picture_in32[ccnt].rgbarr[0] = (b << 3) | (b >> 2);
				ccnt++;
			}
			src+=pitchs;
		}
	}
	//////////////////////////////////////////////////
	#ifdef _XBRZ_MT_
		// One worker per slice; every worker receives the address of its own
		// slice index. The 32 -> 16 bit write-back is done by the workers.
		// NOTE(review): the workers iterate over DstW/DstH - verify their
		// indexing for destinations larger than Src*_SCALER_.
		HANDLE hThread_[NUM_SLICE];
		DWORD threadID_[NUM_SLICE];
		unsigned char slice_[NUM_SLICE];
		for(unsigned char c = 0; c < NUM_SLICE; c++)
			slice_[c] = c;
		for(i = 0; i < NUM_SLICE; i++)
			hThread_[i] = CreateThread(NULL, 0, xbrz::scale, &slice_[i], 0, &threadID_[i]);
		// Block until all slices are finished before the frame is handed back.
		WaitForMultipleObjects(NUM_SLICE, hThread_, TRUE, INFINITE);
		for(i = 0; i < NUM_SLICE; i++)
			CloseHandle(hThread_[i]);
	#else
		WORD* dst = (WORD*)rpo->DstPtr;
		// The scaled picture is SrcW*_SCALER_ x SrcH*_SCALER_ pixels and is
		// read back contiguously, so the loops must run over exactly that
		// size. Iterating over DstW/DstH skews the image and reads beyond the
		// scaled data whenever the destination is larger than the output.
		const unsigned int outW = (unsigned int)(rpo->SrcW*_SCALER_);
		const unsigned int outH = (unsigned int)(rpo->SrcH*_SCALER_);
		xbrz::scale();
		ccnt=0;
		// complete vmode unravel, since otherwise we'd need to check it in every line or pixel -> performance
		if(VideoFormat) {
			for(i=0; i<outH; i++) {
				for(j=0; j<outW; j++) {
					picture_out32[ccnt].rgbarr[2] = picture_out32[ccnt].rgbarr[2] >> 3;
					picture_out32[ccnt].rgbarr[1] = picture_out32[ccnt].rgbarr[1] >> 3;
					picture_out32[ccnt].rgbarr[0] = picture_out32[ccnt].rgbarr[0] >> 3;
					dst[j]=(picture_out32[ccnt].rgbarr[2] << 10) | (picture_out32[ccnt].rgbarr[1] << 5) | picture_out32[ccnt].rgbarr[0];
					ccnt++;
				}
				dst+=pitchd;
			}
		}
		else {
			for(i=0; i<outH; i++) {
				for(j=0; j<outW; j++) {
					picture_out32[ccnt].rgbarr[2] = picture_out32[ccnt].rgbarr[2] >> 3;
					picture_out32[ccnt].rgbarr[1] = picture_out32[ccnt].rgbarr[1] >> 2;
					picture_out32[ccnt].rgbarr[0] = picture_out32[ccnt].rgbarr[0] >> 3;
					dst[j]=(picture_out32[ccnt].rgbarr[2] << 11) | (picture_out32[ccnt].rgbarr[1] << 5) | picture_out32[ccnt].rgbarr[0];
					ccnt++;
				}
				dst+=pitchd;
			}
		}
	#endif
	//////////////////////////////////////////////////
	// Tell the caller how much of the destination was actually filled.
	rpo->OutW=rpo->SrcW*_SCALER_;
	rpo->OutH=rpo->SrcH*_SCALER_;
}



////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////

// DLL entry point: owns the lifetime of the two 32 bit conversion buffers.
// They are allocated and zeroed when the DLL is attached to the process and
// released when it is detached; every other notification is ignored.
// Always returns TRUE.
BOOL APIENTRY DllMain(HMODULE hModule, DWORD dwReason, LPVOID lpReserved) {
	switch(dwReason) {
		case DLL_PROCESS_ATTACH: {
			// worst-case source: 320*480 (interlaced). Not sure whether Kega
			// forwards interlaced pictures to plugins at all (or whether the
			// size check in RenderPluginOutput() rejects them); either way
			// this is only ~10 MiB for the x4 version and should always be
			// available.
			const unsigned int srcPixels = 320*480;
			// worst-case output: the source enlarged by _SCALER_ in both
			// directions (*16 for the 4x version)
			const unsigned int outPixels = 320*480*_SCALER_*_SCALER_;

			picture_in32 = new rgbpixel[srcPixels];
			ZeroMemory(picture_in32, srcPixels*sizeof(rgbpixel));

			picture_out32 = new rgbpixel[outPixels];
			ZeroMemory(picture_out32, outPixels*sizeof(rgbpixel));
			break;
		}
		case DLL_PROCESS_DETACH:
			delete[] picture_in32;
			delete[] picture_out32;
			break;
		default:
			break;
	}
	return TRUE;
}

